In [34]:
repo_pth = '../../'
#resource_pth = '../../opt/rfcx-data/'
resource_pth = '../../../resources/'

In [35]:
import os
import sys
from datetime import datetime
import numpy as np
import pandas
import seaborn
from matplotlib import pyplot as plt
import sqlite3
import pickle

add_paths = [repo_pth+'rfcx-worker-analysis/modules/domain_modules', repo_pth+'notebook-display']
for p in add_paths:
    if p not in sys.path:    
        sys.path.append(p)

import load_sound
import spectral_analysis
import fingerprinting
import sound_classification

from IPython.html.widgets import interactive, Checkbox, interact
from IPython.display import display, HTML
from IPython.html import widgets
from IPython.core.display import clear_output

import nbio

# cropped portion of sound file:
# seek_sec -15, +20

In [36]:
reload(load_sound)
reload(spectral_analysis)
reload(fingerprinting)
reload(sound_classification)
reload(nbio)
show = nbio.show
read_sound = load_sound.read_sound
write_sound = load_sound.write_sound
Sound = load_sound.Sound 
Spectrum = spectral_analysis.Spectrum
Profile = fingerprinting.Profile
SoundClassifier = sound_classification.SoundClassifier

def play(snd): 
    nbio.play(snd.data, snd.samplerate)
    
def read_sound(fp):
    fn = fp.split('/')[-1]
    name = fn.split('.')[0]
    gdate, time = name.split('T')
    time = time.replace('-',':')
    gid, y, m, d = gdate.split('-')
    date = '-'.join([y,m,d])
    dt = 'T'.join([date,time])
    meta_data = {'guardian_id':gid, 'start_time':dt}
    return load_sound.read_sound(fp, meta_data)


loaded nbio

In [37]:
event_fn = resource_pth+'events2.tsv'
data_pth = resource_pth+'wav/'

in_dir = sorted(os.listdir(data_pth))
print '%s files found in %s' % (len(set(in_dir)), data_pth)


459 files found in ../../../resources/wav/

Read the tsv into a dataframe

results file provided by Topher (some columns added by Paul) ignore those for which we don't have a data file


In [38]:
df = pandas.io.parsers.read_csv(
    event_fn, 
    sep ='\t',
    #delim_whitespace=True,
    parse_dates = ['time'],
    infer_datetime_format=True,
).groupby('has_file').get_group(True).copy()

Read the set of all fingerprints generated by current classifier


In [39]:
fips = pickle.load(open('current_classifier_fips.pkl'))
df['fips'] = fips
print len(fips)


1993

Filter out non-useful events


In [40]:
df2 = df.copy()
            
df2['has_fip'] = [f is not None for f in df2['fips']]
df2 = df2.groupby('has_fip').get_group(True).copy()

df2['not_gsm'] = [f['classification']!='GSM_Noise' for f in df2['fips']]
df2 = df2.groupby('not_gsm').get_group(True).copy()

#df2['has_alert'] = [type(e)!=type(None) for e in df2['vectors']]
#df2 = df2.groupby('has_alert').get_group(True)

df2['checked'] = df2['valid']!=4
df2 = df2.groupby('checked').get_group(1).copy()

df2['isnt_bad'] = [f!='22720ac238e1' for f in df2['guardian']]
df2 = df2.groupby('isnt_bad').get_group(True).copy()

print len(df2)


218

In [41]:
set(df2['guardian'])


Out[41]:
{'48043ef7e3bc',
 '5dbdf80c2085',
 '686862515160',
 '6a7b86c28a67',
 'b0eb9c751fa5',
 'd2f3d1c7cca5',
 'ed7f84df28eb'}

In [42]:
fips[0].keys()


Out[42]:
['time_interval',
 'moving_volume_fit',
 'classification',
 'fingerprint',
 'event_timestamp',
 'meta_data',
 'volume_power',
 'harmonic_intvl',
 'harmonic_power']

In [43]:
df2.iloc[0]


Out[43]:
type                                                         car
id                              48043ef7e3bc-2015-01-05T09-38-08
guardian                                            48043ef7e3bc
time                                         2015-01-05 09:38:55
unixtime                                              1420450735
weekday                                                      Mon
weekday_num                                                    1
hour                                                          10
day_night                                                    day
seek                                                        0:47
location                    48043ef7e3bc-2015-01-05T09-38-08.wav
seek_sec                                                      47
has_file                                                    True
start_seek                                                    42
stop_seek                                                     57
timestamp                                                  34735
checked                                                     True
valid                                                          1
fips           {u'time_interval': [5.20408163265, 9.285714285...
has_fip                                                     True
not_gsm                                                     True
isnt_bad                                                    True
Name: 0, dtype: object

Get additional data from the fingerprints dict and add them to the dataframe


In [44]:
df2['vectors']        = [fip['fingerprint']    for fip in df2['fips']]
df2['harmonic_intvl'] = [fip['harmonic_intvl'] for fip in df2['fips']]
df2['volume_power']   = [fip['volume_power']    for fip in df2['fips']]
df2['harmonic_power'] = [fip['harmonic_power']         for fip in df2['fips']]
df2['duration']       = [fip['time_interval'][1]-fip['time_interval'][0] for fip in df2['fips']]
df2['offset']         = [abs(15-fip['time_interval'][0])                 for fip in df2['fips']]
df2['classification'] = [fip['classification'] for fip in df2['fips']]

#df['time_interval'] = interval 

mv = [fip['moving_volume_fit'] for fip in df2['fips']]
lsq,vals = zip(*mv)
a,b,c = zip(*vals)
df2['fit_lsq'] = lsq
df2['fit_a'] = a
df2['fit_b'] = b
df2['fit_c'] = c

df2['harmonic_intvl_mean'] = [np.mean(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_std'] = [np.std(e) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dmean'] = [np.mean(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]
df2['harmonic_intvl_dstd'] = [np.std(np.abs(np.diff(e))) for e in df2['harmonic_intvl']]
df2['volume_mean'] = [np.mean(np.abs(e)) for e in df2['volume_power']]
df2['volume_max'] = [np.max(e) for e in df2['volume_power']]
df2['volume_std'] = [np.std(e) for e in df2['volume_power']]
df2['harmonic_mean'] = [np.mean(e) for e in df2['harmonic_power']]
df2['harmonic_max'] = [np.max(e) for e in df2['harmonic_power']]
df2['harmonic_std'] = [np.std(e) for e in df2['harmonic_power']]

In [45]:
df3 = df2.reindex(np.random.permutation(df2.index))
#df3 = df2

df3['valid'].dtype


Out[45]:
dtype('int64')

Run a ML classifier


In [46]:
from sklearn.svm import SVC, SVR
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from nolearn.dbn import DBN
#help(RandomForestClassifier)

In [48]:
t_idx = 'vectors'
vector_len = len(df3[t_idx][0])
r0 = np.empty((len(df3[t_idx]),vector_len))
for i,e in enumerate(df3[t_idx]):
    r0[i,:]=e
#print 'train', vehic_train.shape

features = [
    'harmonic_intvl_mean','harmonic_intvl_dmean',
    'harmonic_intvl_std', 'harmonic_intvl_dstd', 
    'duration',
    'fit_a','fit_b','fit_c',
    'fit_lsq', 
    'harmonic_mean','harmonic_max', 'harmonic_std', 
    'volume_mean', 'volume_std',
]

cols = [np.asarray(df3[s])[:,None] for s in features] +[r0]
vehic_train = np.hstack(cols)

vehic_train = r0



print 'train', vehic_train.shape, vehic_train.dtype

vehic_target = np.asarray(df3['valid'])
print 'target', vehic_target.shape, vehic_target.dtype

clf = DBN(
    [-1, 30, -1],
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=20,
    #verbose=1,
    )
#clf = linear_model.SGDClassifier()
#clf = SVC(kernel='poly')
#clf = MultinomialNB().fit(vehic_train, vehic_target)
#clf = RandomForestClassifier(n_estimators=100, class_weight ={0:1, 1:10})

split = int(.75* len(df3))

# train the classifier
X_train = vehic_train[:split]
X_test = vehic_train[split:]
y_train = vehic_target[:split]
y_test = vehic_train[split:]
clf.fit(X_train, y_train)

# run the classifier
y_pred = clf.predict(X_test)

vehic_predict = y_pred

from sklearn.metrics import classification_report
#from sklearn.metrics import zero_one_score

#print "Accuracy:", zero_one_score(y_test, y_pred)
print "Classification report:"
#print classification_report(y_test, y_pred)



# test the classifier
from sklearn import metrics


print metrics.confusion_matrix(vehic_target[split:], vehic_predict)
print 'predict', vehic_predict.sum()
print 'real', vehic_target[split:].sum()
try:
    for f,v in zip(features+range(vector_len), clf.feature_importances_):
        print f,v
    
except:
    pass
print r0.shape


train (218L, 20L) float64
target (218L,) int64
Classification report:
[[41  0]
 [14  0]]
predict 0
real 14
(218L, 20L)

In [92]:
dfx = df3.sort('harmonic_mean')
arr = np.asarray(dfx['valid'])
ham = [1]
nbio.show(np.convolve(arr,ham))



In [93]:
for i in range(5):
    ev = dfx.iloc[-i]
    fip = ev['fips']
    snd = read_sound(fip['meta_data']['filepath']).crop(ev.start_seek, ev.stop_seek)
    spc = Spectrum(snd)
    pro = Profile(spc)
    pro.get_harmonic_sound_bounds()
    nbio.show(pro.profile_plot())
    play(snd)
    print ev.location


../../rfcx-worker-analysis/modules/domain_modules\fingerprinting.py:171: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  overall_avg = np.average(a[0:stop*harmonics[-1],:], axis=0)
6a7b86c28a67-2015-02-16T08-44-02.wav
../../rfcx-worker-analysis/modules/domain_modules\fingerprinting.py:171: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  overall_avg = np.average(a[0:stop*harmonics[-1],:], axis=0)
686862515160-2015-02-28T11-18-41.wav
../../rfcx-worker-analysis/modules/domain_modules\fingerprinting.py:171: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  overall_avg = np.average(a[0:stop*harmonics[-1],:], axis=0)
48043ef7e3bc-2015-01-27T15-15-47.wav
../../rfcx-worker-analysis/modules/domain_modules\fingerprinting.py:171: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  overall_avg = np.average(a[0:stop*harmonics[-1],:], axis=0)
d2f3d1c7cca5-2015-02-28T09-05-42.wav
../../rfcx-worker-analysis/modules/domain_modules\fingerprinting.py:171: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  overall_avg = np.average(a[0:stop*harmonics[-1],:], axis=0)
686862515160-2015-02-28T11-26-43.wav

In [73]:
print np.mean(df_1['harmonic_mean'])
print np.mean(df_0['harmonic_mean'])


 4.4890653257
4.94344103012

In [40]:
for i,ev in df_0.iterrows():
    if ev['harmonic_mean']>15:
        fip = ev['fips']
        snd = read_sound(fip['meta_data']['filepath']).crop(ev.start_seek, ev.stop_seek)
        play(snd)
        #nbio.show(fip['harmonic_power'])
        plt.clf()
        plt.plot(fip['harmonic_power'],'.')
        nbio.show(plt.gcf())
        break



In [74]:
i = 1930
ev = df3.loc[i]
print ev
loc = ev['location']
seek = ev['seek_sec']
vol = ev['volume']
a = ev['fit_factor']
b = ev['fit_fac_b']
c = ev['fit_fac_c']
i0,i1 = ev['time_interval']
snd = read_sound(data_pth+loc, {}).crop(seek-15+i0,seek-15+i1)
x = np.arange(len(vol))
v2 = b/(a**2+(x-c)**2)
play(read_sound(data_pth+loc, {}))
print c
nbio.show(vol)
nbio.show(v2)


type                                                                truck
id                                       686862515160-2015-02-28T15-22-04
guardian                                                     686862515160
time                                                  2015-02-28 15:24:01
unixtime                                                       1425137041
weekday                                                               Sat
weekday_num                                                             6
hour                                                                   16
day_night                                                             day
seek                                                                 1:57
location                             686862515160-2015-02-28T15-22-04.wav
seek_sec                                                              117
has_file                                                             True
start_seek                                                            112
stop_seek                                                             127
timestamp                                                           55441
checked                                                              True
valid                                                                   1
vectors                 [0.0502897096196, 0.0828893397807, 0.043675664...
harmonic_intvl          [43.0773463432, 43.0897770054, 43.1022076677, ...
volume                  [1.09595629591, 0.94261829558, 0.957994416812,...
duration                                                          3.46537
has_alert                                                            True
offset                                                         0.01737017
time_interval                                     [14.9826298343, 18.448]
harmonic_intvl_mean                                              40.43108
harmonic_intvl_std                                               4.374429
harmonic_intvl_dmean                                            0.4674376
harmonic_intvl_dstd                                             0.3726956
volume_mean                                                      2.332568
fit_factor                                                   1.448876e-07
fit_fac_b                                                        99935.88
fit_fac_c                                                        317.3219
fit_lsq                                                          37.75918
Name: 1930, dtype: object
317.321908247

In [113]:
ax = seaborn.factorplot('valid', "volume_max",
                    data=df3, kind="box",
    )

for e in ax.axes:
    for ee in e:
        ee.set_ylim(0,1000)
nbio.show(seaborn.plt.gcf())



In [72]:
dfp = pandas.pivot_table(valid_df, values="valid",
                         index=["weekday"], columns=["hour"], 
                         aggfunc=np.sum).reindex(index='Mon Tue Wed Thu Fri Sat Sun'.split(),
                                                columns=range(24)
                                                )

In [73]:
type(dfp)


Out[73]:
pandas.core.frame.DataFrame

In [74]:
sns.plt.clf()
sns.heatmap(dfp)

nbio.show(sns.plt.gcf())



In [ ]: